Quality Control & EDA

All samples (Before SVA)

Sample clustering

sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))

Correlation Heatmap

annot = dplyr::select(experimental_metadata, condition)
row.names(annot) = experimental_metadata$sample_id
rld %>%
  assay() %>%
  cor() %>%
  pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
           annotation = annot,
           annotation_colors = list(condition = c(Control = "#BF3100", OKSM = "#E9B44C", 
                                                  Senolytic = "#1B998B", Senolytic_OKSM = "#5D576B")),
           cluster_rows = TRUE,
           cluster_cols = T,
           cellwidth = 13,
           cellheight = 13)

PCA

ntop = 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)

pca_data <- plotPCA(rld, intgroup = c("condition", "replicate"), returnData=TRUE)
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=replicate)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance")) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_classic() + geom_text(data = pca_data, aes(PC1,PC2, label = name), hjust = 1.2)

SVA

From the hierarchical clustering, correlation heatmap, and PCA, there seems to be some variation that we’re not accounting for. Here, we use SVA to account for the unknown variation. Since the number of variables is unknown, we call the sva function without the n.sv argument, allowing the algorithm to estimate the number of factors.

# There are batch effects for this dataset. However, running ~ batch + condition gives us an error that says the model matrix is not full rank. So we follow this guide https://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html#removing-hidden-batch-effects
library(sva)
dat  <- counts(dds, normalized = TRUE)
idx  <- rowMeans(dat) > 1
dat  <- dat[idx, ]
mod  <- model.matrix(~ condition, colData(rld))
mod0 <- model.matrix(~ 1, colData(rld))
svseq <- svaseq(dat, mod, mod0)
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5

According to SVA, there are 2 significant surrogate variables.

par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
  stripchart(svseq$sv[, i] ~ dds$condition, vertical = TRUE, main = paste0("SV", i))
  abline(h = 0)
 }

After accounting for the 2 significant surrogate variables, this is what the data looks like:

Sample clustering

sampleDists <- dist(t(assay(rld)))
plot(hclust(sampleDists))

Correlation

rld %>%
  assay() %>%
  cor() %>%
  pheatmap(color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
           annotation = annot,
           annotation_colors = list(condition = c(Control = "#BF3100", OKSM = "#E9B44C", 
                                                  Senolytic = "#1B998B", Senolytic_OKSM = "#5D576B")),
           cluster_rows = TRUE,
           cluster_cols = T,
           cellwidth = 13,
           cellheight = 13)

PCA

ntop = 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca = prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)

pca_data <- plotPCA(rld, intgroup = c("condition", "replicate"), returnData=TRUE)
percentVar <- round(100 * attr(pca_data, "percentVar"), digits=2)
ggplot(pca_data, aes(PC1, PC2, color=condition, shape=replicate)) + geom_point(size=3) +
  scale_x_continuous(paste0("PC1: ",percentVar[1],"% variance")) +
  scale_y_continuous(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() + theme_classic() + geom_text(data = pca_data, aes(PC1,PC2, label = name), hjust = 1.2)

Removing TPM Outliers

effective_lengths = matrix(0, ncol=length(experimental_metadata$sample_id), nrow=17714)
colnames(effective_lengths)= experimental_metadata$sample_id
for( i in experimental_metadata$sample_id){
  effective_lengths[,i] = read.table(paste("data/ubiquitous/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$effective_length
}
row.names(effective_lengths) = read.table(paste("data/ubiquitous/", i, ".genes.results",sep=""), sep="\t", header=TRUE)$gene_id

effective_lengths = rowMeans(effective_lengths[row.names(counts(ddssva)),])
ncrpk = counts(ddssva) / (effective_lengths / 1000)
ncrpk = apply(ncrpk, c(1,2), function(x){if(is.nan(x)){0}else{x}})
ncrpk = apply(ncrpk, c(1,2), function(x){if(is.infinite(x)){0}else{x}})
ncscalingfactor = colSums(ncrpk) / 1e6
nctpm = sweep(ncrpk, 2, ncscalingfactor, "/")

nctpm.melt = melt(nctpm)
ggplot(nctpm.melt, aes(x=Var2, y=value)) + geom_boxplot() + theme_classic() + theme(axis.text.x = element_text(angle = 90, colour="black", hjust = 1)) + scale_x_discrete("Sample") + scale_y_continuous("TPM")

tpm.threshold = 10000
test.tpm = apply(nctpm, 1, function(x){ any(x> tpm.threshold) })
ensembl.genes[names(test.tpm[test.tpm])] %>%
  as.data.frame() %>%
  datatable(options = list(scrollX = TRUE))

Differential Expression Analysis

Wald Tests

OKSM vs Control(TdTom)

res1 <- results(dds_wald, contrast = c( "condition", "OKSM", "Control"), test="Wald")
res <- lfcShrink(dds = dds_wald, res = res1, type = "normal", contrast = c( "condition", "OKSM", "Control"))
res$gene_biotype <- ensembl.genes$gene_biotype[match(row.names(res1), ensembl.genes$gene_id)]
res$external_gene_name <- ensembl.genes$external_gene_name[match(row.names(res1), ensembl.genes$gene_id)]
print(paste("Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)):", sum(res$padj < 0.1 & abs(res$log2FoldChange) > lfc.threshold, na.rm = T)))
## [1] "Number of significant genes (padj < 0.1 & log2FoldChange < log2(1.5)): 271"
hist(res$pvalue)

write_files(res, "OKSM", "Control")
EnhancedVolcano(res, lab=res$external_gene_name, pCutoff = 0.05, FCcutoff = lfc.threshold,
                  x = "log2FoldChange", y = "padj",
                  title = NULL,
                  subtitle = NULL,
                  legendPosition = 'right',
                  legendLabSize = 8)

Visualisation

# assay(x) to access the count data
assay(significant_rld) <- limma::removeBatchEffect(assay(significant_rld), covariates = cov)
sig_mat_rld = assay(significant_rld)

# The apply function swaps the rows to samples and columns to genes -- the standard is the other way around: samples in cols and genes in rows, hence the transpose function
zscores = t(apply(sig_mat_rld, 1, function(x){ (x - mean(x)) / sd(x) }))

Heatmap

pam_clust <- generate_data(zscores, 2, "pam")
# pam_clust <- as.data.frame(pam_clust)
# pam_clust$Cluster <- factor(pam_clust$Cluster, levels = c(5,4,3,1,2,6))
# pam_clust <- pam_clust[order(pam_clust$Cluster),]

pheatmap(pam_clust[,1:(ncol(pam_clust)-1)],
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
         fontsize_row = 5.5,
         annotation_col = annotation,
         annotation_colors = anno_colours,
         cluster_rows = FALSE,
         cluster_cols = FALSE)

Table of Genes

pam_clust_df <- pam_clust %>%
  as.data.frame() %>%
  mutate(gene_name = ensembl.genes[rownames(.),]$external_gene_name) %>%
  dplyr::select(gene_name, Cluster) %>%
  dplyr::rename("Gene Name" = gene_name)
datatable(pam_clust_df, options = list(scrollX = TRUE), class = "white-space: nowrap")

Number of Genes

c1 <- pam_clust_df[pam_clust_df$Cluster == 1, ] %>%
  dplyr::select(-Cluster)

c2 <- pam_clust_df[pam_clust_df$Cluster == 2, ] %>%
  dplyr::select(-Cluster)

data.frame(Cluster = c("Cluster 1", "Cluster 2", "Total"),
                        Number_of_genes = c(nrow(c1), nrow(c2),
                                            sum(c(nrow(c1),nrow(c2)))))
##     Cluster Number_of_genes
## 1 Cluster 1             176
## 2 Cluster 2              95
## 3     Total             271

Silhouette Plot

## Checking the stability of clusters
#fviz_nbclust(zscores, kmeans, method = "silhouette")
set.seed(2)
dd = as.dist((1 - cor(t(zscores)))/2)
pam = pam(dd, 2, diss = TRUE)
sil = silhouette(pam$clustering, dd)
plot(sil, border=NA, main = "Silhouette Plot for 2 Clusters")

Functional Enrichments

#listEnrichrSites()
setEnrichrSite("FlyEnrichr")
dbs <- listEnrichrDbs()
to_check <- c("GO_Biological_Process_2018", "KEGG_2019")

Cluster 1

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")

Cluster 2

GO_Biological_Process_2018

eresList$GO_Biological_Process_2018 %>%
  plot_enrichr("GO_Biological_Process_2018")

datatable(eresList[[1]], options = list(scrollX = TRUE), class = "white-space: nowrap")

KEGG_2019

eresList$KEGG_2019 %>%
  plot_enrichr("KEGG_2019") 

datatable(eresList[[2]], options = list(scrollX = TRUE), class = "white-space: nowrap")

Motif Enrichment Analysis

Cluster 1

c1_ame <- read.delim("ame/pairwise/output/c1/0.3/ame.tsv")
c1_ame %>%
  dplyr::select(-motif_DB, -motif_ID) %>%
  datatable(options = list(scrollX = TRUE))

Cluster 2

c2_ame <- read.delim("ame/pairwise/output/c2/0.3/ame.tsv")
c2_ame %>%
  dplyr::select(-motif_DB, -motif_ID) %>%
  datatable(options = list(scrollX = TRUE))

Session Info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] sva_3.42.0                            BiocParallel_1.28.0                  
##  [3] mgcv_1.8-38                           nlme_3.1-153                         
##  [5] enrichR_3.0                           DT_0.19                              
##  [7] cowplot_1.1.1                         RColorBrewer_1.1-2                   
##  [9] scales_1.1.1                          reshape2_1.4.4                       
## [11] knitr_1.36                            biomaRt_2.50.0                       
## [13] GenomicFeatures_1.46.1                AnnotationDbi_1.56.2                 
## [15] genefilter_1.76.0                     DESeq2_1.34.0                        
## [17] SummarizedExperiment_1.24.0           Biobase_2.54.0                       
## [19] MatrixGenerics_1.6.0                  matrixStats_0.61.0                   
## [21] BSgenome.Dmelanogaster.UCSC.dm6_1.4.1 BSgenome_1.62.0                      
## [23] rtracklayer_1.54.0                    GenomicRanges_1.46.0                 
## [25] Biostrings_2.62.0                     GenomeInfoDb_1.30.0                  
## [27] XVector_0.34.0                        IRanges_2.28.0                       
## [29] S4Vectors_0.32.2                      BiocGenerics_0.40.0                  
## [31] forcats_0.5.1                         stringr_1.4.0                        
## [33] dplyr_1.0.7                           purrr_0.3.4                          
## [35] readr_2.0.2                           tidyr_1.1.4                          
## [37] tibble_3.1.6                          tidyverse_1.3.1                      
## [39] EnhancedVolcano_1.12.0                ggrepel_0.9.1                        
## [41] ggplot2_3.3.5                         pheatmap_1.0.12                      
## [43] cluster_2.1.2                        
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1             backports_1.3.0          BiocFileCache_2.2.0     
##   [4] plyr_1.8.6               splines_4.1.2            crosstalk_1.2.0         
##   [7] digest_0.6.28            htmltools_0.5.2          fansi_0.5.0             
##  [10] magrittr_2.0.1           memoise_2.0.0            limma_3.50.0            
##  [13] tzdb_0.2.0               annotate_1.72.0          modelr_0.1.8            
##  [16] extrafont_0.17           extrafontdb_1.0          prettyunits_1.1.1       
##  [19] colorspace_2.0-2         blob_1.2.2               rvest_1.0.2             
##  [22] rappdirs_0.3.3           haven_2.4.3              xfun_0.29               
##  [25] crayon_1.4.2             RCurl_1.98-1.5           jsonlite_1.7.2          
##  [28] survival_3.2-13          glue_1.5.0               gtable_0.3.0            
##  [31] zlibbioc_1.40.0          DelayedArray_0.20.0      proj4_1.0-10.1          
##  [34] car_3.0-12               Rttf2pt1_1.3.9           maps_3.4.0              
##  [37] abind_1.4-5              edgeR_3.36.0             DBI_1.1.1               
##  [40] rstatix_0.7.0            Rcpp_1.0.7               xtable_1.8-4            
##  [43] progress_1.2.2           bit_4.0.4                htmlwidgets_1.5.4       
##  [46] httr_1.4.2               ellipsis_0.3.2           farver_2.1.0            
##  [49] pkgconfig_2.0.3          XML_3.99-0.8             sass_0.4.0              
##  [52] dbplyr_2.1.1             locfit_1.5-9.4           utf8_1.2.2              
##  [55] labeling_0.4.2           tidyselect_1.1.1         rlang_0.4.12            
##  [58] munsell_0.5.0            cellranger_1.1.0         tools_4.1.2             
##  [61] cachem_1.0.6             cli_3.1.0                generics_0.1.1          
##  [64] RSQLite_2.2.8            broom_0.7.10             evaluate_0.14           
##  [67] fastmap_1.1.0            yaml_2.2.1               bit64_4.0.5             
##  [70] fs_1.5.0                 KEGGREST_1.34.0          ash_1.0-15              
##  [73] ggrastr_1.0.1            xml2_1.3.2               compiler_4.1.2          
##  [76] rstudioapi_0.13          beeswarm_0.4.0           filelock_1.0.2          
##  [79] curl_4.3.2               png_0.1-7                ggsignif_0.6.3          
##  [82] reprex_2.0.1             geneplotter_1.72.0       bslib_0.3.1             
##  [85] stringi_1.7.5            highr_0.9                ggalt_0.4.0             
##  [88] lattice_0.20-45          Matrix_1.3-4             vctrs_0.3.8             
##  [91] pillar_1.6.4             lifecycle_1.0.1          jquerylib_0.1.4         
##  [94] bitops_1.0-7             R6_2.5.1                 BiocIO_1.4.0            
##  [97] KernSmooth_2.23-20       vipor_0.4.5              codetools_0.2-18        
## [100] MASS_7.3-54              assertthat_0.2.1         rprojroot_2.0.2         
## [103] rjson_0.2.20             withr_2.4.2              GenomicAlignments_1.30.0
## [106] Rsamtools_2.10.0         GenomeInfoDbData_1.2.7   parallel_4.1.2          
## [109] hms_1.1.1                grid_4.1.2               rmarkdown_2.11          
## [112] carData_3.0-4            ggpubr_0.4.0             lubridate_1.8.0         
## [115] ggbeeswarm_0.6.0         restfulr_0.0.13